5 Metagenomic data

Species-specific community analyses conducted to generate the data included in these analyses can be found in the annex.

5.1 Library complexity

Nonpareil estimate of the metagenomic complexity after removing host DNA.

all_data %>%
    select(dataset,Extraction,nonpareil,Taxon) %>%
    unique() %>%
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.1f±%.1f", mean(nonpareil), sd(nonpareil))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of breadth of host genome coverage")
tinytable_9hpca6vkb9d0l8f4lw2x
Mean and standard deviation of breadth of host genome coverage
Taxon DREX EHEX ZYMO
Amphibian 0.8±0.1 0.8±0.1 0.9±0.1
Bird 1.0±0.0 0.8±0.4 0.9±0.1
Control 0.5±0.6 0.5±0.7 0.0±0.0
Mammal 0.8±0.1 0.7±0.3 0.8±0.2
Reptile 0.9±0.0 0.9±0.1 0.9±0.1
all_data %>%
    select(dataset,Extraction,nonpareil,Taxon) %>%
    unique() %>%
    mutate(Taxon=factor(Taxon,levels=c("Amphibian","Reptile","Mammal","Bird","Control"))) %>%
    mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
    ggplot(aes(x=Extraction,y=nonpareil))+ 
        geom_boxplot() + 
        facet_grid(. ~ Taxon, scales = "free") +
        labs(y="Nonpareil completeness",x="Extraction method")

all_data  %>%
    select(dataset,Extraction,Sample,Species,nonpareil,Taxon) %>%
    unique() %>%
    mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
    filter(Taxon != "Control") %>%
    lmerTest::lmer(nonpareil ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_o1m91po2hkp3ez668rg9
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 0.885552853 0.03450229 25.6664948 40.89542 7.753071e-27
fixed NA ExtractionDREX 0.005536892 0.04336611 0.1276779 48.00020 8.989373e-01
fixed NA ExtractionEHEX -0.112193866 0.04336611 -2.5871326 48.00020 1.276294e-02
ran_pars Sample sd__(Intercept) 0.059565487 NA NA NA NA
ran_pars Species sd__(Intercept) 0.035030813 NA NA NA NA
ran_pars Residual sd__Observation 0.150224598 NA NA NA NA

5.2 Alpha diversity

Variance partitioning metrics are derived from community_analysis.Rmd.

alpha_data <- list.files(path = "results", pattern = "^alpha_.*\\.tsv$", full.names = TRUE) %>%
  map_df(~ read_tsv(.)) %>%
  left_join(all_data,by= join_by(dataset==dataset))
alpha_data %>%
    pivot_longer(!c(dataset,Library,Species,Taxon,Sample,Extraction), names_to = "metric", values_to = "value") %>%
    mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
    mutate(Taxon=factor(Taxon,levels=c("Amphibian","Reptile","Mammal"))) %>%
    mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
    ggplot(aes(x=Extraction,y=value))+ 
        geom_boxplot() + 
        facet_grid(metric ~ Taxon, scales = "free")

5.2.1 Richness

alpha_data %>%
    select(dataset,Extraction,Sample,Species,richness,Taxon) %>%
    unique() %>%
    mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
    lmerTest::lmer(richness ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_pjvr89dadesyvmnjddm5
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 59.44444444 10.715517 5.5475108 10.12322 0.000234351
fixed NA ExtractionDREX 0.05555556 4.453352 0.0124750 34.97883 0.990117532
fixed NA ExtractionEHEX 1.38257472 4.545332 0.3041746 35.13134 0.762789322
ran_pars Sample sd__(Intercept) 16.00359332 NA NA NA NA
ran_pars Species sd__(Intercept) 28.56742258 NA NA NA NA
ran_pars Residual sd__Observation 13.36005511 NA NA NA NA

5.2.2 Neutral

alpha_data %>%
    select(dataset,Extraction,Sample,Species,neutral,Taxon) %>%
    unique() %>%
    mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
    lmerTest::lmer(neutral ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_08bhf8uublqlvroqt3fp
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 29.5413181 5.139162 5.7482754 9.895197 0.000193227
fixed NA ExtractionDREX -2.1102758 1.922343 -1.0977625 35.042971 0.279794105
fixed NA ExtractionEHEX -0.3422373 1.962694 -0.1743712 35.152526 0.862574163
ran_pars Sample sd__(Intercept) 8.6429526 NA NA NA NA
ran_pars Species sd__(Intercept) 13.5543072 NA NA NA NA
ran_pars Residual sd__Observation 5.7670282 NA NA NA NA

5.2.3 Phylogenetic

alpha_data %>%
    select(dataset,Extraction,Sample,Species,phylogenetic,Taxon) %>%
    unique() %>%
    mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
    lmerTest::lmer(phylogenetic ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_etsa3nifli19n6jldv5m
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 4.7721984 0.4324985 11.034024 9.653749 8.685116e-07
fixed NA ExtractionDREX 0.1521100 0.1403528 1.083769 35.015284 2.858737e-01
fixed NA ExtractionEHEX 0.1852709 0.1433723 1.292236 35.056809 2.047290e-01
ran_pars Sample sd__(Intercept) 1.2179889 NA NA NA NA
ran_pars Species sd__(Intercept) 0.9236345 NA NA NA NA
ran_pars Residual sd__Observation 0.4210584 NA NA NA NA

5.3 Microbial complexity recovery

all_data %>%
    select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon) %>%
    mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction*100))) %>%
    mutate(damr=ifelse(is.na(damr),0,damr)) %>%
    unique() %>%
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.1f±%.1f", mean(damr), sd(damr))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of breadth of host genome coverage")
tinytable_scajeu357qawp1ypxx2g
Mean and standard deviation of breadth of host genome coverage
Taxon DREX EHEX ZYMO
Amphibian 0.8±0.3 0.8±0.3 0.8±0.4
Bird 0.5±0.4 0.6±0.4 0.6±0.4
Control 1.0±0.0 1.0±0.0 1.0±0.0
Mammal 0.9±0.1 0.8±0.3 0.8±0.1
Reptile 1.0±0.0 1.0±0.0 0.9±0.1
all_data %>%
    select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon) %>%
    mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction*100))) %>%
    mutate(damr=ifelse(is.na(damr),0,damr)) %>%
    unique() %>%
    mutate(Taxon=factor(Taxon,levels=c("Amphibian","Reptile","Mammal","Bird","Control"))) %>%
    mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
    ggplot(aes(x=Extraction,y=damr))+ 
        geom_boxplot() + 
        facet_grid(. ~ Taxon, scales = "free") +
        labs(y="Domain-adjusted mapping rate",x="Extraction method")

all_data  %>%
    select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon, Sample, Species) %>%
    mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction*100))) %>%
    mutate(damr=ifelse(is.na(damr),0,damr)) %>%
    unique() %>%
    mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
    filter(Taxon != "Control") %>%
    lmerTest::lmer(damr ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_vkbc5i6m2y6f82drwgac
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 0.78271978 0.06660971 11.7508366 15.45687 4.117676e-09
fixed NA ExtractionDREX 0.03348421 0.03977983 0.8417385 130.19072 4.014779e-01
fixed NA ExtractionEHEX 0.02228400 0.03999367 0.5571881 130.24817 5.783552e-01
ran_pars Sample sd__(Intercept) 0.12024342 NA NA NA NA
ran_pars Species sd__(Intercept) 0.19047224 NA NA NA NA
ran_pars Residual sd__Observation 0.20283813 NA NA NA NA

5.4 Variance partitioning

Variance partitioning metrics are derived from community_analysis.Rmd.

variance_data <- list.files(path = "results", pattern = "^var_.*\\.tsv$", full.names = TRUE) %>%
  map_df(~ read_tsv(.))

variance_data %>% summarise(mean=mean(r2),sd=sd(r2))
# A tibble: 1 × 2
   mean     sd
  <dbl>  <dbl>
1 0.102 0.0768
variance_data %>%
    left_join(sample_metadata %>% select(Species,Taxon) %>% unique(),by=join_by(species==Species)) %>%
    mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
    mutate(Taxon=factor(Taxon,levels=c("Amphibian","Reptile","Mammal"))) %>%
    ggplot(aes(x=Taxon,y=r2)) +
    geom_boxplot()+
    ylim(0,1)+
    facet_grid(. ~ metric, scales = "free")

variance_data %>%
    group_by(metric) %>%
    summarise(mean=mean(r2),sd=sd(r2)) %>%
    tt()
tinytable_6v6g91e1qhtf8g46454r
metric mean sd
neutral 0.06201935 0.04744380
phylogenetic 0.07381277 0.06884687
richness 0.16970818 0.06626388

5.5 Combined community analysis

species="combined"
genus=species

sample_metadata <- read_tsv(paste0("data/metadata.tsv")) %>%
    rename(dataset=Dataset)

read_counts <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_counts.tsv.gz") %>%
  rename(genome = 1)

genome_metadata <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_mag_info.tsv.gz") %>%
  rename(genome = 1, length=mag_size)

genome_coverage <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_coverage.tsv.gz") %>%
  rename(genome = 1)

download.file("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134.tree.gz", "data/DMB0134.tree.gz")
genome_tree <- read_tree("data/DMB0134.tree.gz")

5.5.1 Filter data

#Filter by coverage
min_coverage=0.3
read_counts_filt <- genome_coverage %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts[[cur_column()]]))

# Transform  to genome counts (non-filtered)
readlength=150
genome_counts <- read_counts %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

# Transform to genome counts (coverage-filtered)
readlength=150
genome_counts_filt <- read_counts_filt %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

5.5.2 Community barplot

# Retrieve EHI taxonomy colors
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    select(colors) %>%
    pull()

# Stacked barplot
genome_counts_filt %>%
    mutate_at(vars(-genome), funs(./sum(., na.rm = TRUE)))  %>% #apply TSS nornalisation
    pivot_longer(-genome, names_to = "dataset", values_to = "count") %>%
    left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append taxonomy
    left_join(., sample_metadata, by = join_by(dataset == dataset)) %>% #append taxonomy
    mutate(Extraction=factor(Extraction,levels=c("ZYMO","DREX","EHEX"))) %>%
    filter(Taxon != "Control") %>%
    ggplot(., aes(x=dataset,y=count,fill=phylum, group=phylum))+ #grouping enables keeping the same sorting of taxonomic units
          geom_bar(stat="identity", colour="white", linewidth=0.1)+ #plot stacked bars with white borders
          scale_fill_manual(values=phylum_colors) +
          labs(y = "Relative abundance") +
          facet_nested(. ~ Taxon + Species + Sample + Extraction, scales="free_x") +
          guides(fill = guide_legend(ncol = 3)) +
          theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
                          axis.title.x = element_blank(),
                panel.background = element_blank(),
                panel.border = element_blank(),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
                legend.position="none",
                legend.title=element_blank(),
                panel.spacing = unit(0, "lines"))

ggsave(paste0("figures/barplot_",genus,".pdf"))
genome_counts_NMDS <- genome_counts_filt %>%
            column_to_rownames(var="genome") %>%
            select(where(~!all(. == 0))) %>%
            hillpair(.,q=1, metric="C", out="dist") %>%
            metaMDS(.,trymax = 999, k=2, trace=0) %>%
            vegan::scores() %>%
            as_tibble(., rownames = "dataset") %>%
            left_join(sample_metadata, by = join_by(dataset == dataset)) %>%
            filter(Taxon != "Control") %>%
            group_by(Sample) %>%
            mutate(sample_x=mean(NMDS1), sample_y=mean(NMDS2))

genome_counts_NMDS %>%
            ggplot(., aes(x=NMDS1,y=NMDS2, color=Species, shape=Extraction)) +
                scale_color_manual(values=c("#92d4d1","#9cba7b","#d9d88d","#40bfb8","#73a842","#c2c935","#bf7777","#4a7135","#25968e","#aa3333","#5e1717","#8d8e35")) +
                geom_point(size=3, alpha=0.8) +
                geom_segment(aes(x=sample_x, y=sample_y, xend=NMDS1, yend=NMDS2), alpha=0.2) +
                theme_classic() +
                theme(legend.position="right", legend.box="vertical") +
                guides(color=guide_legend(title="Species"))